In [2]:
import numpy as np
import pandas as pd


# Data reader from the Internet
# pip install pandas-datareader

import matplotlib.pyplot as plt
import seaborn as sns

# render matplotlib figures inline in the Jupyter notebook
%matplotlib inline

# display image files inside the Jupyter notebook
from IPython.display import Image

###################
# Interactive plots
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

# get plotly version
from plotly import __version__
#print(__version__)

import cufflinks as cf

# For Notebooks
init_notebook_mode(connected=True) 
# For offline use
cf.go_offline()

#[[Plotly "after May 2020"
# + pip install chart-studio
#import chart-studio.plotly as py

###################
# Machine Learning (pip install scikit-learn)
# general import pattern: from sklearn.<model_family> import <TheModel>

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression

from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import RandomForestClassifier

from sklearn.svm import SVC

## try a bunch of hyper-parameter combinations and keep whatever works best
from sklearn.model_selection import GridSearchCV

from sklearn.cluster import KMeans

from sklearn.decomposition import PCA

from sklearn.metrics import classification_report, confusion_matrix
#from sklearn.metrics import confusion_matrix


#from sklearn.datasets import load_boston #deprecated
# BUT others are available (load_breast_cancer, load....)
In [ ]:
# MatPlotLib rearrange display
fig.tight_layout() 
# or 
plt.tight_layout() 
In [ ]:
# SeaBorn Load dataset
tips = sns.load_dataset('tips')
In [6]:
# import data using a custom separator (tab) and explicit column names
# (the raw file has no header row)
column_names = ['user_id', 'item_id', 'rating', 'timestamp']
df_movies = pd.read_csv('datas/u.data', sep='\t', names=column_names)
df_movies.head()
Out[6]:
user_id item_id rating timestamp
0 0 50 5 881250949
1 0 172 5 881250949
2 0 133 1 881250949
3 196 242 3 881250949
4 186 302 3 891717742
In [34]:
movie_titles.head()
Out[34]:
item_id title
0 1 Toy Story (1995)
1 2 GoldenEye (1995)
2 3 Four Rooms (1995)
3 4 Get Shorty (1995)
4 5 Copycat (1995)
In [7]:
# merge datasets: attach the movie title to every rating row via 'item_id'
movie_titles = pd.read_csv("datas/Movie_Id_Titles")
movie_titles.head()

df_movies = pd.merge(df_movies,movie_titles,on='item_id')
# if merging on index columns ==> can use join instead
# df_movies = df_movies.join(movie_titles['title'])
df_movies.head()
Out[7]:
user_id item_id rating timestamp title
0 0 50 5 881250949 Star Wars (1977)
1 290 50 5 880473582 Star Wars (1977)
2 79 50 4 891271545 Star Wars (1977)
3 2 50 5 888552084 Star Wars (1977)
4 8 50 5 879362124 Star Wars (1977)
In [10]:
# average rating per movie title, highest rated first (top 5)
mean_ratings = df_movies.groupby('title')['rating'].mean()
mean_ratings.sort_values(ascending=False).head()
Out[10]:
title
They Made Me a Criminal (1939)                5.0
Marlene Dietrich: Shadow and Light (1996)     5.0
Saint of Fort Washington, The (1993)          5.0
Someone Else's America (1995)                 5.0
Star Kid (1997)                               5.0
Name: rating, dtype: float64
In [11]:
df_movies.groupby('title')['rating'].count().sort_values(ascending=False).head(4)
Out[11]:
title
Star Wars (1977)             584
Contact (1997)               509
Fargo (1996)                 508
Return of the Jedi (1983)    507
Name: rating, dtype: int64
In [18]:
# rating distribution for a single movie: how many 1-star, 2-star, ... ratings
df_movies[df_movies['title']=='Star Wars (1977)'].groupby(
    ['title','rating'])['item_id'].count()
Out[18]:
title             rating
Star Wars (1977)  1           9
                  2          16
                  3          57
                  4         176
                  5         326
Name: item_id, dtype: int64
In [26]:
# top 10 movies ranked by how many 5-star ratings they received
df_movies[df_movies['rating']==5].groupby(
    ['title','rating'])['item_id'].count().sort_values(ascending=False).head(10)
Out[26]:
title                             rating
Star Wars (1977)                  5         326
Fargo (1996)                      5         227
Godfather, The (1972)             5         214
Raiders of the Lost Ark (1981)    5         202
Pulp Fiction (1994)               5         188
Schindler's List (1993)           5         186
Silence of the Lambs, The (1991)  5         181
Titanic (1997)                    5         179
Empire Strikes Back, The (1980)   5         173
Return of the Jedi (1983)         5         171
Name: item_id, dtype: int64
In [33]:
# summary table per movie: average rating plus how often it was rated;
# reuse a single groupby instead of rebuilding it for each aggregate
rating_by_title = df_movies.groupby('title')['rating']
ratings = rating_by_title.mean().to_frame()
ratings['num of ratings'] = rating_by_title.count()
# most-rated movies first
ratings.sort_values(by='num of ratings', ascending=False).head(10)
Out[33]:
rating num of ratings
title
Star Wars (1977) 4.359589 584
Contact (1997) 3.803536 509
Fargo (1996) 4.155512 508
Return of the Jedi (1983) 4.007890 507
Liar Liar (1997) 3.156701 485
English Patient, The (1996) 3.656965 481
Scream (1996) 3.441423 478
Toy Story (1995) 3.878319 452
Air Force One (1997) 3.631090 431
Independence Day (ID4) (1996) 3.438228 429
In [5]:
train = pd.read_csv('datas/titanic_train.csv') #index_col=0)
In [44]:
train # 891 x 12
Out[44]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 S
... ... ... ... ... ... ... ... ... ... ... ...
886 887 0 2 Montvila, Rev. Juozas male 27.0 0 0 211536 13.0000 S
887 888 1 1 Graham, Miss. Margaret Edith female 19.0 0 0 112053 30.0000 S
888 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female 24.0 1 2 W./C. 6607 23.4500 S
889 890 1 1 Behr, Mr. Karl Howell male 26.0 0 0 111369 30.0000 C
890 891 0 3 Dooley, Mr. Patrick male 32.0 0 0 370376 7.7500 Q

891 rows × 11 columns

In [4]:
train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
In [5]:
train.describe()
Out[5]:
PassengerId Survived Pclass Age SibSp Parch Fare
count 891.000000 891.000000 891.000000 714.000000 891.000000 891.000000 891.000000
mean 446.000000 0.383838 2.308642 29.699118 0.523008 0.381594 32.204208
std 257.353842 0.486592 0.836071 14.526497 1.102743 0.806057 49.693429
min 1.000000 0.000000 1.000000 0.420000 0.000000 0.000000 0.000000
25% 223.500000 0.000000 2.000000 20.125000 0.000000 0.000000 7.910400
50% 446.000000 0.000000 3.000000 28.000000 0.000000 0.000000 14.454200
75% 668.500000 1.000000 3.000000 38.000000 1.000000 0.000000 31.000000
max 891.000000 1.000000 3.000000 80.000000 8.000000 6.000000 512.329200
In [48]:
##################
# visualize missing data: each light cell in the heatmap is a null value,
# making sparse columns (Age, Cabin) obvious at a glance
##################
sns.heatmap(train.isnull(),yticklabels=False,cbar=False,cmap='viridis')
Out[48]:
<AxesSubplot: >
In [ ]:
####################################
# drop a column (Cabin is ~77% null, too sparse to impute)
####################################
train.drop('Cabin',axis=1 #axis=0 for rows, axis=1 for columns
          ,inplace=True) #inplace=True mutates train itself (no copy returned)
train # 891 x 11
In [47]:
####################################
# fill missing values (here Age replaced by mean age in Pclass)
####################################
def impute_age(cols):
    """Return an imputed Age for one (Age, Pclass) row.

    If Age is missing, fall back to the approximate mean age observed
    for the passenger's class; otherwise return the Age unchanged.

    Parameters
    ----------
    cols : pandas.Series
        A row with labels 'Age' and 'Pclass' (as produced by
        ``train[['Age','Pclass']].apply(impute_age, axis=1)``).
    """
    # Label-based access: positional indexing on a labeled Series
    # (cols[0], cols[1]) is deprecated in modern pandas and will raise
    # in future versions.
    Age = cols['Age']
    Pclass = cols['Pclass']

    if pd.isnull(Age):
        # per-class mean ages estimated from the training data
        if Pclass == 1:
            return 37
        elif Pclass == 2:
            return 29
        else:
            return 24

    return Age
    
train['Age'] = train[['Age','Pclass']].apply(impute_age,axis=1)
train
Out[47]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 S
... ... ... ... ... ... ... ... ... ... ... ...
886 887 0 2 Montvila, Rev. Juozas male 27.0 0 0 211536 13.0000 S
887 888 1 1 Graham, Miss. Margaret Edith female 19.0 0 0 112053 30.0000 S
888 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female 24.0 1 2 W./C. 6607 23.4500 S
889 890 1 1 Behr, Mr. Karl Howell male 26.0 0 0 111369 30.0000 C
890 891 0 3 Dooley, Mr. Patrick male 32.0 0 0 370376 7.7500 Q

891 rows × 11 columns

In [49]:
####################################
# drop null values (DROPS THE ROW containing null values)
####################################
train.dropna(inplace=True)
train # 889 x 11
Out[49]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 S
... ... ... ... ... ... ... ... ... ... ... ...
886 887 0 2 Montvila, Rev. Juozas male 27.0 0 0 211536 13.0000 S
887 888 1 1 Graham, Miss. Margaret Edith female 19.0 0 0 112053 30.0000 S
888 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female 24.0 1 2 W./C. 6607 23.4500 S
889 890 1 1 Behr, Mr. Karl Howell male 26.0 0 0 111369 30.0000 C
890 891 0 3 Dooley, Mr. Patrick male 32.0 0 0 370376 7.7500 Q

889 rows × 11 columns

In [ ]:
train.drop('male',axis=1 #axis=0 for rows, axis=1 for columns
          ,inplace=True)
In [58]:
train
Out[58]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 S
... ... ... ... ... ... ... ... ... ... ... ...
886 887 0 2 Montvila, Rev. Juozas male 27.0 0 0 211536 13.0000 S
887 888 1 1 Graham, Miss. Margaret Edith female 19.0 0 0 112053 30.0000 S
888 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female 24.0 1 2 W./C. 6607 23.4500 S
889 890 1 1 Behr, Mr. Karl Howell male 26.0 0 0 111369 30.0000 C
890 891 0 3 Dooley, Mr. Patrick male 32.0 0 0 370376 7.7500 Q

889 rows × 11 columns

In [8]:
####################################
# replace non numeric values (short form)
####################################
loans = pd.read_csv('datas/loan_data.csv')

#cat_feats = ['purpose']

# one-hot encode the categorical 'purpose' column into 0/1 indicator columns
final_data = pd.get_dummies(loans,columns=['purpose']) # ,drop_first=True would avoid collinearity
In [7]:
loans['purpose'].unique()
Out[7]:
array(['debt_consolidation', 'credit_card', 'all_other',
       'home_improvement', 'small_business', 'major_purchase',
       'educational'], dtype=object)
In [9]:
final_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9578 entries, 0 to 9577
Data columns (total 20 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   credit.policy               9578 non-null   int64  
 1   int.rate                    9578 non-null   float64
 2   installment                 9578 non-null   float64
 3   log.annual.inc              9578 non-null   float64
 4   dti                         9578 non-null   float64
 5   fico                        9578 non-null   int64  
 6   days.with.cr.line           9578 non-null   float64
 7   revol.bal                   9578 non-null   int64  
 8   revol.util                  9578 non-null   float64
 9   inq.last.6mths              9578 non-null   int64  
 10  delinq.2yrs                 9578 non-null   int64  
 11  pub.rec                     9578 non-null   int64  
 12  not.fully.paid              9578 non-null   int64  
 13  purpose_all_other           9578 non-null   uint8  
 14  purpose_credit_card         9578 non-null   uint8  
 15  purpose_debt_consolidation  9578 non-null   uint8  
 16  purpose_educational         9578 non-null   uint8  
 17  purpose_home_improvement    9578 non-null   uint8  
 18  purpose_major_purchase      9578 non-null   uint8  
 19  purpose_small_business      9578 non-null   uint8  
dtypes: float64(6), int64(7), uint8(7)
memory usage: 1.0 MB
In [5]:
loans.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9578 entries, 0 to 9577
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   credit.policy      9578 non-null   int64  
 1   purpose            9578 non-null   object 
 2   int.rate           9578 non-null   float64
 3   installment        9578 non-null   float64
 4   log.annual.inc     9578 non-null   float64
 5   dti                9578 non-null   float64
 6   fico               9578 non-null   int64  
 7   days.with.cr.line  9578 non-null   float64
 8   revol.bal          9578 non-null   int64  
 9   revol.util         9578 non-null   float64
 10  inq.last.6mths     9578 non-null   int64  
 11  delinq.2yrs        9578 non-null   int64  
 12  pub.rec            9578 non-null   int64  
 13  not.fully.paid     9578 non-null   int64  
dtypes: float64(6), int64(7), object(1)
memory usage: 1.0+ MB
In [59]:
####################################
# replace non numeric values
####################################
pd.get_dummies(train['Sex']) # returns tab with columns = values and rows = true/false (1/0)
# for sex it's male or female, so we can drop the first column (one implies the other)
pd.get_dummies(train['Sex'],drop_first=True)
# save in a variable
sex = pd.get_dummies(train['Sex'],drop_first=True)
# add the indicator column to our data (column-wise concat)
train = pd.concat([train,sex],axis=1)

train
Out[59]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Embarked male
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 S 1
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C 0
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 S 0
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 S 0
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 S 1
... ... ... ... ... ... ... ... ... ... ... ... ...
886 887 0 2 Montvila, Rev. Juozas male 27.0 0 0 211536 13.0000 S 1
887 888 1 1 Graham, Miss. Margaret Edith female 19.0 0 0 112053 30.0000 S 0
888 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female 24.0 1 2 W./C. 6607 23.4500 S 0
889 890 1 1 Behr, Mr. Karl Howell male 26.0 0 0 111369 30.0000 C 1
890 891 0 3 Dooley, Mr. Patrick male 32.0 0 0 370376 7.7500 Q 1

889 rows × 12 columns

In [64]:
train['Embarked'].unique() #unique values in a column
Out[64]:
array(['S', 'C', 'Q'], dtype=object)
In [60]:
pd.get_dummies(train['Embarked'],drop_first=True)
Out[60]:
Q S
0 0 1
1 0 0
2 0 1
3 0 1
4 0 1
... ... ...
886 0 1
887 0 1
888 0 1
889 0 0
890 1 0

889 rows × 2 columns

In [93]:
# save dummies in a variable (drop_first removes the redundant first category)
embarked = pd.get_dummies(train['Embarked'],drop_first=True)
# add the Q/S indicator columns to our data
train = pd.concat([train,embarked],axis=1)
# the original text column is now redundant
train.drop('Embarked',axis=1,inplace=True)
train
Out[93]:
Survived Pclass Age SibSp Parch Fare male Q S
0 0 3 22.0 1 0 7.2500 1 0 1
1 1 1 38.0 1 0 71.2833 0 0 0
2 1 3 26.0 0 0 7.9250 0 0 1
3 1 1 35.0 1 0 53.1000 0 0 1
4 0 3 35.0 0 0 8.0500 1 0 1
... ... ... ... ... ... ... ... ... ...
886 0 2 27.0 0 0 13.0000 1 0 1
887 1 1 19.0 0 0 30.0000 0 0 1
888 0 3 24.0 1 2 23.4500 0 0 1
889 1 1 26.0 0 0 30.0000 1 0 0
890 0 3 32.0 0 0 7.7500 1 1 0

889 rows × 9 columns

In [92]:
train.drop('Q',axis=1,inplace=True)
train
Out[92]:
Survived Pclass Age SibSp Parch Fare Embarked male
0 0 3 22.0 1 0 7.2500 S 1
1 1 1 38.0 1 0 71.2833 C 0
2 1 3 26.0 0 0 7.9250 S 0
3 1 1 35.0 1 0 53.1000 S 0
4 0 3 35.0 0 0 8.0500 S 1
... ... ... ... ... ... ... ... ...
886 0 2 27.0 0 0 13.0000 S 1
887 1 1 19.0 0 0 30.0000 S 0
888 0 3 24.0 1 2 23.4500 S 0
889 1 1 26.0 0 0 30.0000 C 1
890 0 3 32.0 0 0 7.7500 Q 1

889 rows × 8 columns

In [ ]:
####################################
# dates - timestamps treatment
####################################
# parse the string column once into proper datetime64 values
df['timeStamp']=pd.to_datetime(df['timeStamp'])


# the vectorized .dt accessor is the idiomatic (and much faster) way to
# extract datetime components — avoids a Python-level .apply per row
df['Hour']=df['timeStamp'].dt.hour
df['Month']=df['timeStamp'].dt.month
df['Day of Week']=df['timeStamp'].dt.dayofweek
# map the numeric day-of-week (Monday=0 ... Sunday=6) to a readable label
dmap = {0:'Mon',1:'Tue',2:'Wed',3:'Thu',4:'Fri',5:'Sat',6:'Sun'}
df['Day of Week']=df['Day of Week'].map(dmap)
In [ ]:
# fast cross-checking of the data: pairwise scatter plots colored by a class column
sns.pairplot(data=df,hue='COL_NAME',palette='bwr') # placeholder parameters

image.png

In [ ]:
 
In [24]:
####################################
# inset plot: a zoomed histogram drawn on top of the full-range one
####################################
fig = plt.figure(figsize=(12,8))
ax1 = fig.add_axes([0,0,1,1]) # main axes fill the whole figure [left, bottom, width, height]
ax2 = fig.add_axes([.3,.3,.6,.5]) # smaller axes placed inside the main one
ax1.hist(train['Fare'],bins=30) # histogram of all data
ax2.hist(train[(train['Fare']<60)]['Fare'],bins=50) # inset zoom on Fares < 60, the most populated range
Out[24]:
(array([ 15.,   0.,   0.,   1.,   1.,  25., 247.,  40.,  35.,   9.,  55.,
         10.,  28.,  28.,   5.,   8.,   8.,  19.,   5.,   8.,  10.,  39.,
         28.,  16.,   8.,  18.,  14.,   5.,   6.,   6.,   2.,   2.,   5.,
         10.,   3.,   1.,   0.,   0.,   0.,   7.,   0.,   3.,   2.,  10.,
          8.,   0.,   3.,  13.,   2.,   1.]),
 array([ 0.   ,  1.188,  2.376,  3.564,  4.752,  5.94 ,  7.128,  8.316,
         9.504, 10.692, 11.88 , 13.068, 14.256, 15.444, 16.632, 17.82 ,
        19.008, 20.196, 21.384, 22.572, 23.76 , 24.948, 26.136, 27.324,
        28.512, 29.7  , 30.888, 32.076, 33.264, 34.452, 35.64 , 36.828,
        38.016, 39.204, 40.392, 41.58 , 42.768, 43.956, 45.144, 46.332,
        47.52 , 48.708, 49.896, 51.084, 52.272, 53.46 , 54.648, 55.836,
        57.024, 58.212, 59.4  ]),
 <BarContainer object of 50 artists>)
In [15]:
loans = pd.read_csv('datas/loan_data.csv')
####################################
# 2 histograms on same plot
####################################
# shared histogram settings
b=25   # number of bins
a=0.5  # transparency so overlapping bars stay visible

# column whose two values (1/0) split the data into the overlaid histograms
col='not.fully.paid'
#col='credit.policy'

plt.figure(figsize=(12,6))
# draw one histogram per class value: red for 1, blue for 0
for value, colour in ((1, 'red'), (0, 'blue')):
    subset = loans[loans[col] == value]['fico']
    subset.hist(label=col + '=' + str(value), bins=b, alpha=a, color=colour)
plt.legend()
plt.xlabel('FICO')
Out[15]:
Text(0.5, 0, 'FICO')
In [16]:
College_Data = pd.read_csv('datas/College_Data')
####################################
# 2 histograms on same plot (one color per 'Private' category)
####################################
sns.set_style('darkgrid')
g = sns.FacetGrid(data=College_Data, hue="Private",
                  palette='coolwarm', height=6,
                  #size=6, #deprecated, replaced by height
                  aspect=2)
g = g.map(plt.hist,'Outstate',bins=20,alpha=0.7).add_legend()
In [12]:
loans = pd.read_csv('datas/loan_data.csv')
sns.lmplot(data=loans,x='fico',y='int.rate',
           hue='credit.policy',col='not.fully.paid')
Out[12]:
<seaborn.axisgrid.FacetGrid at 0x2a0603c97b0>
In [14]:
ad_data = pd.read_csv('datas/advertising.csv')
sns.pairplot(data=ad_data,hue='Clicked on Ad',palette='bwr')
Out[14]:
<seaborn.axisgrid.PairGrid at 0x2a061023c10>
In [16]:
####################################
#interactive version
####################################
import cufflinks as cf
cf.go_offline()
In [19]:
train[train['Fare']<60]['Fare'].iplot(kind='hist',bins=50,color='green')
In [11]:
Image(filename='imgs/13-Logistic-Regression--01-Logistic Regression with Python--iplot-hist.png')
Out[11]:
In [25]:
####################################
# Multiple interative plots
####################################
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot


from plotly import __version__
print(__version__)
5.11.0
In [ ]:
# build one trace per histogram; the first trace is restored from the
# commented-out line — the previous `hist1 = go.Bar()` was a leftover
# placeholder that rendered as an empty trace
hist1 = go.Histogram(x=train['Fare'],name='All Fares')
hist2 = go.Histogram(x=train[train['Fare']<60]['Fare'],name='Fare < 60')

fig1 = go.Figure(data=hist1)
fig2 = go.Figure(data=hist2)

# cufflinks stitches the two figures into a single 1x2 interactive subplot grid
figs = cf.subplots([fig1, fig2],shape=(1,2))
iplot(figs)
In [3]:
# http://localhost:8888/notebooks/Refactored_Py_DS_ML_Bootcamp-master/13-Logistic-Regression/02-Logistic%20Regression%20Project.ipynb
# 
ad_data = pd.read_csv('datas/advertising.csv')
sns.jointplot(data=ad_data,x='Age',y='Daily Time Spent on Site',
              kind='kde',color='red',fill=True,
              marginal_kws=dict(alpha=0.1))
Out[3]:
<seaborn.axisgrid.JointGrid at 0x1bcf8378400>
In [13]:
Image(filename='imgs/13-Logistic-Regression--02-Logistic Regression Project--JoinPlot.png')
Out[13]:
In [ ]:
train.drop(['PassengerId','Name','Sex','Ticket'],axis=1,inplace=True)
In [78]:
pd.get_dummies(train['Embarked'],drop_first=True)
Out[78]:
Q S
0 0 1
1 0 0
2 0 1
3 0 1
4 0 1
... ... ...
886 0 1
887 0 1
888 0 1
889 0 0
890 1 0

889 rows × 2 columns

¶

Machine Learning¶

¶

LogisticRegression¶

¶

In [4]:
######## CONVERT LINEAR Regression to LOGISTIC Regression ########

Image('imgs/linear-to-logistic-1.JPG')
Out[4]:

Convert formula¶

$$\phi(z) = \frac {1} {1+ e^{-z}} $$
In [5]:
Image('imgs/linear-to-logistic-2.JPG')
Out[5]:
In [ ]:
######## LogisticRegression ########

#####################################
# Prepare trainings and tests data
#####################################
from sklearn.model_selection import train_test_split
# target: the survival flag; features: every other column
y = train['Survived']
X = train.drop('Survived',axis=1)
# 70/30 train/test split; fixed random_state makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X,y,
                                                    test_size=0.30, 
                                                    random_state=101)

#####################################
# Train model
#####################################
from sklearn.linear_model import LogisticRegression

logmodel = LogisticRegression() # create instance of Logistic model
logmodel.fit(X=X_train,y=y_train)

#####################################
# Run predictions
#####################################
predictions = logmodel.predict(X_test)

K Nearest Neighbors (KNN)¶

¶

In [7]:
######## K Nearest Neighbors (KNN) ########
df = pd.read_csv('datas/KNN_Project_Data')
#####################################
# Prepare trainings and tests data
#####################################
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

scaler = StandardScaler()
# learn mean/std of every feature column (target excluded)
scaler.fit(df.drop('TARGET CLASS',axis=1))

# scale features to standardize everything to the same scale:
# otherwise large-scale features would dominate the distance metric KNN relies on
scaled_features = scaler.transform(df.drop('TARGET CLASS',axis=1))
#==> returns values around 0 [-1.... .. +1.....]

df_feat = pd.DataFrame(scaled_features,columns=df.columns[:-1])

X=scaled_features
y=df['TARGET CLASS']
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.30,random_state=101)

#####################################
# Train model
#####################################
# (KNeighborsClassifier already imported above — duplicate import removed)
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X=X_train, y=y_train)

#####################################
# Run predictions
#####################################
predictions = knn.predict(X_test)
predictions


#####################################
# Find Best K value (elbow method)
#####################################
error_rate = []
iMaxLoop = 60

for i in range(1,iMaxLoop) :
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train,y_train)
    pred_i = knn.predict(X_test)
    error_rate.append(np.mean(pred_i!=y_test))

    print("**************************")
    print("**classification_report** WITH K=",str(i))
    print(classification_report(y_test,pred_i))
    print("**confusion_matrix** WITH K=",str(i))
    conf_matrix = confusion_matrix(y_test,pred_i)
    print(conf_matrix)
    # sklearn convention: rows = true class, columns = predicted class, so
    # [0,1] = actual 0 predicted 1 (false positive) and
    # [1,0] = actual 1 predicted 0 (false negative).
    # NOTE: the two variable names were swapped in the original code
    # (printed total was still correct since only the sum is shown).
    FalPos = conf_matrix[0,1]
    FalNeg = conf_matrix[1,0]
    print("Errors :",str(FalNeg+FalPos))
 
plt.figure(figsize=(10,6))
plt.plot(range(1,iMaxLoop),error_rate,
        color='blue',linestyle='dashed', #linestyle='--',
         marker='o',markerfacecolor='red',markersize=10)
plt.xlabel('K')
plt.ylabel('Error Rate')
plt.title('Error rate vs K value')
**************************
**classification_report** WITH K= 1
              precision    recall  f1-score   support

           0       0.73      0.72      0.72       152
           1       0.71      0.72      0.72       148

    accuracy                           0.72       300
   macro avg       0.72      0.72      0.72       300
weighted avg       0.72      0.72      0.72       300

**confusion_matrix** WITH K= 1
[[109  43]
 [ 41 107]]
Errors : 84
**************************
**classification_report** WITH K= 2
              precision    recall  f1-score   support

           0       0.67      0.85      0.75       152
           1       0.79      0.57      0.66       148

    accuracy                           0.71       300
   macro avg       0.73      0.71      0.70       300
weighted avg       0.73      0.71      0.70       300

**confusion_matrix** WITH K= 2
[[129  23]
 [ 64  84]]
Errors : 87
**************************
**classification_report** WITH K= 3
              precision    recall  f1-score   support

           0       0.80      0.77      0.78       152
           1       0.77      0.80      0.78       148

    accuracy                           0.78       300
   macro avg       0.78      0.78      0.78       300
weighted avg       0.78      0.78      0.78       300

**confusion_matrix** WITH K= 3
[[117  35]
 [ 30 118]]
Errors : 65
**************************
**classification_report** WITH K= 4
              precision    recall  f1-score   support

           0       0.75      0.86      0.80       152
           1       0.83      0.70      0.76       148

    accuracy                           0.78       300
   macro avg       0.79      0.78      0.78       300
weighted avg       0.79      0.78      0.78       300

**confusion_matrix** WITH K= 4
[[130  22]
 [ 44 104]]
Errors : 66
**************************
**classification_report** WITH K= 5
              precision    recall  f1-score   support

           0       0.79      0.80      0.80       152
           1       0.79      0.78      0.79       148

    accuracy                           0.79       300
   macro avg       0.79      0.79      0.79       300
weighted avg       0.79      0.79      0.79       300

**confusion_matrix** WITH K= 5
[[122  30]
 [ 32 116]]
Errors : 62
**************************
**classification_report** WITH K= 6
              precision    recall  f1-score   support

           0       0.76      0.86      0.80       152
           1       0.83      0.72      0.77       148

    accuracy                           0.79       300
   macro avg       0.79      0.79      0.79       300
weighted avg       0.79      0.79      0.79       300

**confusion_matrix** WITH K= 6
[[130  22]
 [ 41 107]]
Errors : 63
**************************
**classification_report** WITH K= 7
              precision    recall  f1-score   support

           0       0.83      0.81      0.82       152
           1       0.81      0.82      0.82       148

    accuracy                           0.82       300
   macro avg       0.82      0.82      0.82       300
weighted avg       0.82      0.82      0.82       300

**confusion_matrix** WITH K= 7
[[123  29]
 [ 26 122]]
Errors : 55
**************************
**classification_report** WITH K= 8
              precision    recall  f1-score   support

           0       0.80      0.84      0.82       152
           1       0.83      0.78      0.80       148

    accuracy                           0.81       300
   macro avg       0.81      0.81      0.81       300
weighted avg       0.81      0.81      0.81       300

**confusion_matrix** WITH K= 8
[[128  24]
 [ 33 115]]
Errors : 57
**************************
**classification_report** WITH K= 9
              precision    recall  f1-score   support

           0       0.81      0.81      0.81       152
           1       0.81      0.81      0.81       148

    accuracy                           0.81       300
   macro avg       0.81      0.81      0.81       300
weighted avg       0.81      0.81      0.81       300

**confusion_matrix** WITH K= 9
[[123  29]
 [ 28 120]]
Errors : 57
**************************
**classification_report** WITH K= 10
              precision    recall  f1-score   support

           0       0.81      0.86      0.83       152
           1       0.84      0.79      0.82       148

    accuracy                           0.82       300
   macro avg       0.82      0.82      0.82       300
weighted avg       0.82      0.82      0.82       300

**confusion_matrix** WITH K= 10
[[130  22]
 [ 31 117]]
Errors : 53
**************************
**classification_report** WITH K= 11
              precision    recall  f1-score   support

           0       0.83      0.81      0.82       152
           1       0.81      0.82      0.82       148

    accuracy                           0.82       300
   macro avg       0.82      0.82      0.82       300
weighted avg       0.82      0.82      0.82       300

**confusion_matrix** WITH K= 11
[[123  29]
 [ 26 122]]
Errors : 55
**************************
**classification_report** WITH K= 12
              precision    recall  f1-score   support

           0       0.81      0.84      0.82       152
           1       0.83      0.79      0.81       148

    accuracy                           0.82       300
   macro avg       0.82      0.82      0.82       300
weighted avg       0.82      0.82      0.82       300

**confusion_matrix** WITH K= 12
[[128  24]
 [ 31 117]]
Errors : 55
**************************
**classification_report** WITH K= 13
              precision    recall  f1-score   support

           0       0.83      0.81      0.82       152
           1       0.81      0.82      0.82       148

    accuracy                           0.82       300
   macro avg       0.82      0.82      0.82       300
weighted avg       0.82      0.82      0.82       300

**confusion_matrix** WITH K= 13
[[123  29]
 [ 26 122]]
Errors : 55
**************************
**classification_report** WITH K= 14
              precision    recall  f1-score   support

           0       0.81      0.84      0.82       152
           1       0.83      0.80      0.82       148

    accuracy                           0.82       300
   macro avg       0.82      0.82      0.82       300
weighted avg       0.82      0.82      0.82       300

**confusion_matrix** WITH K= 14
[[127  25]
 [ 29 119]]
Errors : 54
**************************
**classification_report** WITH K= 15
              precision    recall  f1-score   support

           0       0.83      0.81      0.82       152
           1       0.81      0.83      0.82       148

    accuracy                           0.82       300
   macro avg       0.82      0.82      0.82       300
weighted avg       0.82      0.82      0.82       300

**confusion_matrix** WITH K= 15
[[123  29]
 [ 25 123]]
Errors : 54
**************************
**classification_report** WITH K= 16
              precision    recall  f1-score   support

           0       0.81      0.84      0.82       152
           1       0.83      0.80      0.82       148

    accuracy                           0.82       300
   macro avg       0.82      0.82      0.82       300
weighted avg       0.82      0.82      0.82       300

**confusion_matrix** WITH K= 16
[[127  25]
 [ 29 119]]
Errors : 54
**************************
**classification_report** WITH K= 17
              precision    recall  f1-score   support

           0       0.84      0.82      0.83       152
           1       0.82      0.84      0.83       148

    accuracy                           0.83       300
   macro avg       0.83      0.83      0.83       300
weighted avg       0.83      0.83      0.83       300

**confusion_matrix** WITH K= 17
[[124  28]
 [ 23 125]]
Errors : 51
**************************
**classification_report** WITH K= 18
              precision    recall  f1-score   support

           0       0.82      0.84      0.83       152
           1       0.83      0.82      0.82       148

    accuracy                           0.83       300
   macro avg       0.83      0.83      0.83       300
weighted avg       0.83      0.83      0.83       300

**confusion_matrix** WITH K= 18
[[127  25]
 [ 27 121]]
Errors : 52
**************************
**classification_report** WITH K= 19
              precision    recall  f1-score   support

           0       0.83      0.82      0.83       152
           1       0.82      0.82      0.82       148

    accuracy                           0.82       300
   macro avg       0.82      0.82      0.82       300
weighted avg       0.82      0.82      0.82       300

**confusion_matrix** WITH K= 19
[[125  27]
 [ 26 122]]
Errors : 53
**************************
**classification_report** WITH K= 20
              precision    recall  f1-score   support

           0       0.82      0.82      0.82       152
           1       0.82      0.81      0.81       148

    accuracy                           0.82       300
   macro avg       0.82      0.82      0.82       300
weighted avg       0.82      0.82      0.82       300

**confusion_matrix** WITH K= 20
[[125  27]
 [ 28 120]]
Errors : 55
**************************
**classification_report** WITH K= 21
              precision    recall  f1-score   support

           0       0.84      0.81      0.82       152
           1       0.81      0.84      0.82       148

    accuracy                           0.82       300
   macro avg       0.82      0.82      0.82       300
weighted avg       0.82      0.82      0.82       300

**confusion_matrix** WITH K= 21
[[123  29]
 [ 24 124]]
Errors : 53
**************************
**classification_report** WITH K= 22
              precision    recall  f1-score   support

           0       0.82      0.82      0.82       152
           1       0.81      0.82      0.81       148

    accuracy                           0.82       300
   macro avg       0.82      0.82      0.82       300
weighted avg       0.82      0.82      0.82       300

**confusion_matrix** WITH K= 22
[[124  28]
 [ 27 121]]
Errors : 55
**************************
**classification_report** WITH K= 23
              precision    recall  f1-score   support

           0       0.85      0.82      0.83       152
           1       0.82      0.85      0.83       148

    accuracy                           0.83       300
   macro avg       0.83      0.83      0.83       300
weighted avg       0.83      0.83      0.83       300

**confusion_matrix** WITH K= 23
[[124  28]
 [ 22 126]]
Errors : 50
**************************
**classification_report** WITH K= 24
              precision    recall  f1-score   support

           0       0.82      0.82      0.82       152
           1       0.82      0.82      0.82       148

    accuracy                           0.82       300
   macro avg       0.82      0.82      0.82       300
weighted avg       0.82      0.82      0.82       300

**confusion_matrix** WITH K= 24
[[125  27]
 [ 27 121]]
Errors : 54
**************************
**classification_report** WITH K= 25
              precision    recall  f1-score   support

           0       0.85      0.82      0.83       152
           1       0.82      0.85      0.83       148

    accuracy                           0.83       300
   macro avg       0.83      0.83      0.83       300
weighted avg       0.83      0.83      0.83       300

**confusion_matrix** WITH K= 25
[[124  28]
 [ 22 126]]
Errors : 50
**************************
**classification_report** WITH K= 26
              precision    recall  f1-score   support

           0       0.84      0.82      0.83       152
           1       0.82      0.84      0.83       148

    accuracy                           0.83       300
   macro avg       0.83      0.83      0.83       300
weighted avg       0.83      0.83      0.83       300

**confusion_matrix** WITH K= 26
[[125  27]
 [ 24 124]]
Errors : 51
**************************
**classification_report** WITH K= 27
              precision    recall  f1-score   support

           0       0.85      0.82      0.83       152
           1       0.82      0.85      0.83       148

    accuracy                           0.83       300
   macro avg       0.83      0.83      0.83       300
weighted avg       0.83      0.83      0.83       300

**confusion_matrix** WITH K= 27
[[124  28]
 [ 22 126]]
Errors : 50
**************************
**classification_report** WITH K= 28
              precision    recall  f1-score   support

           0       0.83      0.82      0.83       152
           1       0.82      0.83      0.83       148

    accuracy                           0.83       300
   macro avg       0.83      0.83      0.83       300
weighted avg       0.83      0.83      0.83       300

**confusion_matrix** WITH K= 28
[[125  27]
 [ 25 123]]
Errors : 52
**************************
**classification_report** WITH K= 29
              precision    recall  f1-score   support

           0       0.85      0.81      0.83       152
           1       0.81      0.86      0.84       148

    accuracy                           0.83       300
   macro avg       0.83      0.83      0.83       300
weighted avg       0.83      0.83      0.83       300

**confusion_matrix** WITH K= 29
[[123  29]
 [ 21 127]]
Errors : 50
**************************
**classification_report** WITH K= 30
              precision    recall  f1-score   support

           0       0.84      0.82      0.83       152
           1       0.82      0.84      0.83       148

    accuracy                           0.83       300
   macro avg       0.83      0.83      0.83       300
weighted avg       0.83      0.83      0.83       300

**confusion_matrix** WITH K= 30
[[124  28]
 [ 24 124]]
Errors : 52
**************************
**classification_report** WITH K= 31
              precision    recall  f1-score   support

           0       0.87      0.81      0.84       152
           1       0.82      0.87      0.84       148

    accuracy                           0.84       300
   macro avg       0.84      0.84      0.84       300
weighted avg       0.84      0.84      0.84       300

**confusion_matrix** WITH K= 31
[[123  29]
 [ 19 129]]
Errors : 48
**************************
**classification_report** WITH K= 32
              precision    recall  f1-score   support

           0       0.85      0.82      0.83       152
           1       0.82      0.85      0.83       148

    accuracy                           0.83       300
   macro avg       0.83      0.83      0.83       300
weighted avg       0.83      0.83      0.83       300

**confusion_matrix** WITH K= 32
[[124  28]
 [ 22 126]]
Errors : 50
**************************
**classification_report** WITH K= 33
              precision    recall  f1-score   support

           0       0.85      0.80      0.82       152
           1       0.81      0.85      0.83       148

    accuracy                           0.83       300
   macro avg       0.83      0.83      0.83       300
weighted avg       0.83      0.83      0.83       300

**confusion_matrix** WITH K= 33
[[122  30]
 [ 22 126]]
Errors : 52
**************************
**classification_report** WITH K= 34
              precision    recall  f1-score   support

           0       0.84      0.81      0.83       152
           1       0.81      0.84      0.83       148

    accuracy                           0.83       300
   macro avg       0.83      0.83      0.83       300
weighted avg       0.83      0.83      0.83       300

**confusion_matrix** WITH K= 34
[[123  29]
 [ 23 125]]
Errors : 52
**************************
**classification_report** WITH K= 35
              precision    recall  f1-score   support

           0       0.85      0.81      0.83       152
           1       0.81      0.85      0.83       148

    accuracy                           0.83       300
   macro avg       0.83      0.83      0.83       300
weighted avg       0.83      0.83      0.83       300

**confusion_matrix** WITH K= 35
[[123  29]
 [ 22 126]]
Errors : 51
**************************
**classification_report** WITH K= 36
              precision    recall  f1-score   support

           0       0.84      0.82      0.83       152
           1       0.82      0.84      0.83       148

    accuracy                           0.83       300
   macro avg       0.83      0.83      0.83       300
weighted avg       0.83      0.83      0.83       300

**confusion_matrix** WITH K= 36
[[125  27]
 [ 23 125]]
Errors : 50
**************************
**classification_report** WITH K= 37
              precision    recall  f1-score   support

           0       0.86      0.82      0.84       152
           1       0.82      0.86      0.84       148

    accuracy                           0.84       300
   macro avg       0.84      0.84      0.84       300
weighted avg       0.84      0.84      0.84       300

**confusion_matrix** WITH K= 37
[[125  27]
 [ 21 127]]
Errors : 48
**************************
**classification_report** WITH K= 38
              precision    recall  f1-score   support

           0       0.85      0.83      0.84       152
           1       0.83      0.84      0.84       148

    accuracy                           0.84       300
   macro avg       0.84      0.84      0.84       300
weighted avg       0.84      0.84      0.84       300

**confusion_matrix** WITH K= 38
[[126  26]
 [ 23 125]]
Errors : 49
**************************
**classification_report** WITH K= 39
              precision    recall  f1-score   support

           0       0.86      0.82      0.84       152
           1       0.82      0.86      0.84       148

    accuracy                           0.84       300
   macro avg       0.84      0.84      0.84       300
weighted avg       0.84      0.84      0.84       300

**confusion_matrix** WITH K= 39
[[125  27]
 [ 21 127]]
Errors : 48
**************************
**classification_report** WITH K= 40
              precision    recall  f1-score   support

           0       0.84      0.82      0.83       152
           1       0.82      0.84      0.83       148

    accuracy                           0.83       300
   macro avg       0.83      0.83      0.83       300
weighted avg       0.83      0.83      0.83       300

**confusion_matrix** WITH K= 40
[[125  27]
 [ 24 124]]
Errors : 51
**************************
**classification_report** WITH K= 41
              precision    recall  f1-score   support

           0       0.85      0.81      0.83       152
           1       0.81      0.86      0.84       148

    accuracy                           0.83       300
   macro avg       0.83      0.83      0.83       300
weighted avg       0.83      0.83      0.83       300

**confusion_matrix** WITH K= 41
[[123  29]
 [ 21 127]]
Errors : 50
**************************
**classification_report** WITH K= 42
              precision    recall  f1-score   support

           0       0.84      0.82      0.83       152
           1       0.82      0.84      0.83       148

    accuracy                           0.83       300
   macro avg       0.83      0.83      0.83       300
weighted avg       0.83      0.83      0.83       300

**confusion_matrix** WITH K= 42
[[125  27]
 [ 23 125]]
Errors : 50
**************************
**classification_report** WITH K= 43
              precision    recall  f1-score   support

           0       0.86      0.82      0.84       152
           1       0.82      0.86      0.84       148

    accuracy                           0.84       300
   macro avg       0.84      0.84      0.84       300
weighted avg       0.84      0.84      0.84       300

**confusion_matrix** WITH K= 43
[[124  28]
 [ 21 127]]
Errors : 49
**************************
**classification_report** WITH K= 44
              precision    recall  f1-score   support

           0       0.84      0.82      0.83       152
           1       0.82      0.84      0.83       148

    accuracy                           0.83       300
   macro avg       0.83      0.83      0.83       300
weighted avg       0.83      0.83      0.83       300

**confusion_matrix** WITH K= 44
[[124  28]
 [ 24 124]]
Errors : 52
**************************
**classification_report** WITH K= 45
              precision    recall  f1-score   support

           0       0.85      0.82      0.83       152
           1       0.82      0.85      0.83       148

    accuracy                           0.83       300
   macro avg       0.83      0.83      0.83       300
weighted avg       0.83      0.83      0.83       300

**confusion_matrix** WITH K= 45
[[124  28]
 [ 22 126]]
Errors : 50
**************************
**classification_report** WITH K= 46
              precision    recall  f1-score   support

           0       0.84      0.82      0.83       152
           1       0.82      0.84      0.83       148

    accuracy                           0.83       300
   macro avg       0.83      0.83      0.83       300
weighted avg       0.83      0.83      0.83       300

**confusion_matrix** WITH K= 46
[[125  27]
 [ 23 125]]
Errors : 50
**************************
**classification_report** WITH K= 47
              precision    recall  f1-score   support

           0       0.84      0.82      0.83       152
           1       0.82      0.84      0.83       148

    accuracy                           0.83       300
   macro avg       0.83      0.83      0.83       300
weighted avg       0.83      0.83      0.83       300

**confusion_matrix** WITH K= 47
[[124  28]
 [ 23 125]]
Errors : 51
**************************
**classification_report** WITH K= 48
              precision    recall  f1-score   support

           0       0.84      0.83      0.83       152
           1       0.83      0.84      0.83       148

    accuracy                           0.83       300
   macro avg       0.83      0.83      0.83       300
weighted avg       0.83      0.83      0.83       300

**confusion_matrix** WITH K= 48
[[126  26]
 [ 24 124]]
Errors : 50
**************************
**classification_report** WITH K= 49
              precision    recall  f1-score   support

           0       0.84      0.82      0.83       152
           1       0.82      0.84      0.83       148

    accuracy                           0.83       300
   macro avg       0.83      0.83      0.83       300
weighted avg       0.83      0.83      0.83       300

**confusion_matrix** WITH K= 49
[[124  28]
 [ 24 124]]
Errors : 52
**************************
**classification_report** WITH K= 50
              precision    recall  f1-score   support

           0       0.84      0.83      0.83       152
           1       0.83      0.84      0.83       148

    accuracy                           0.83       300
   macro avg       0.83      0.83      0.83       300
weighted avg       0.83      0.83      0.83       300

**confusion_matrix** WITH K= 50
[[126  26]
 [ 24 124]]
Errors : 50
**************************
**classification_report** WITH K= 51
              precision    recall  f1-score   support

           0       0.85      0.83      0.84       152
           1       0.83      0.84      0.84       148

    accuracy                           0.84       300
   macro avg       0.84      0.84      0.84       300
weighted avg       0.84      0.84      0.84       300

**confusion_matrix** WITH K= 51
[[126  26]
 [ 23 125]]
Errors : 49
**************************
**classification_report** WITH K= 52
              precision    recall  f1-score   support

           0       0.84      0.84      0.84       152
           1       0.83      0.84      0.84       148

    accuracy                           0.84       300
   macro avg       0.84      0.84      0.84       300
weighted avg       0.84      0.84      0.84       300

**confusion_matrix** WITH K= 52
[[127  25]
 [ 24 124]]
Errors : 49
**************************
**classification_report** WITH K= 53
              precision    recall  f1-score   support

           0       0.85      0.83      0.84       152
           1       0.83      0.85      0.84       148

    accuracy                           0.84       300
   macro avg       0.84      0.84      0.84       300
weighted avg       0.84      0.84      0.84       300

**confusion_matrix** WITH K= 53
[[126  26]
 [ 22 126]]
Errors : 48
**************************
**classification_report** WITH K= 54
              precision    recall  f1-score   support

           0       0.84      0.84      0.84       152
           1       0.84      0.84      0.84       148

    accuracy                           0.84       300
   macro avg       0.84      0.84      0.84       300
weighted avg       0.84      0.84      0.84       300

**confusion_matrix** WITH K= 54
[[128  24]
 [ 24 124]]
Errors : 48
**************************
**classification_report** WITH K= 55
              precision    recall  f1-score   support

           0       0.85      0.82      0.84       152
           1       0.82      0.85      0.84       148

    accuracy                           0.84       300
   macro avg       0.84      0.84      0.84       300
weighted avg       0.84      0.84      0.84       300

**confusion_matrix** WITH K= 55
[[125  27]
 [ 22 126]]
Errors : 49
**************************
**classification_report** WITH K= 56
              precision    recall  f1-score   support

           0       0.83      0.82      0.83       152
           1       0.82      0.83      0.83       148

    accuracy                           0.83       300
   macro avg       0.83      0.83      0.83       300
weighted avg       0.83      0.83      0.83       300

**confusion_matrix** WITH K= 56
[[125  27]
 [ 25 123]]
Errors : 52
**************************
**classification_report** WITH K= 57
              precision    recall  f1-score   support

           0       0.84      0.82      0.83       152
           1       0.82      0.84      0.83       148

    accuracy                           0.83       300
   macro avg       0.83      0.83      0.83       300
weighted avg       0.83      0.83      0.83       300

**confusion_matrix** WITH K= 57
[[125  27]
 [ 23 125]]
Errors : 50
**************************
**classification_report** WITH K= 58
              precision    recall  f1-score   support

           0       0.84      0.82      0.83       152
           1       0.82      0.84      0.83       148

    accuracy                           0.83       300
   macro avg       0.83      0.83      0.83       300
weighted avg       0.83      0.83      0.83       300

**confusion_matrix** WITH K= 58
[[125  27]
 [ 24 124]]
Errors : 51
**************************
**classification_report** WITH K= 59
              precision    recall  f1-score   support

           0       0.85      0.81      0.83       152
           1       0.81      0.85      0.83       148

    accuracy                           0.83       300
   macro avg       0.83      0.83      0.83       300
weighted avg       0.83      0.83      0.83       300

**confusion_matrix** WITH K= 59
[[123  29]
 [ 22 126]]
Errors : 51
Out[7]:
Text(0.5, 1.0, 'Error rate vs K value')

Decision Trees and Random Forest¶

¶

Entropy and Information Gain are the mathematical methods used to choose the best split.¶

image.png

In [5]:
######## Decision Trees and Random Forests ########
# Kyphosis dataset: predict whether the post-operative spinal condition
# ('Kyphosis' column: absent/present) occurs, from the remaining features.
df = pd.read_csv('datas/kyphosis.csv')
#####################################
from sklearn.model_selection import train_test_split

# Features = every column except the target.
X = df.drop('Kyphosis', axis=1)
y = df['Kyphosis']

# random_state pins the split so Restart Kernel -> Run All reproduces
# the same train/test partition (and hence the same report below).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30,
                                                    random_state=101)

from sklearn.tree import DecisionTreeClassifier
# Seed the tree as well: tie-breaking between equally good splits is random.
dtree = DecisionTreeClassifier(random_state=101)

dtree.fit(X_train, y_train)

predictions = dtree.predict(X_test)

from sklearn.metrics import classification_report, confusion_matrix

print("**classification_report**",)
print(classification_report(y_test, predictions))
print("**confusion_matrix**")
print(confusion_matrix(y_test, predictions))
**classification_report**
              precision    recall  f1-score   support

      absent       0.88      0.75      0.81        20
     present       0.38      0.60      0.46         5

    accuracy                           0.72        25
   macro avg       0.63      0.68      0.64        25
weighted avg       0.78      0.72      0.74        25

**confusion_matrix**
[[15  5]
 [ 2  3]]
In [7]:
#####################################
# Random Forest: an ensemble of decision trees, usually stronger than a
# single tree on the same split. Reuses X_train/X_test/y_train/y_test
# from the decision-tree cell above.
from sklearn.ensemble import RandomForestClassifier

# random_state fixes the per-tree bootstrap sampling and feature
# subsampling so the forest (and its metrics) are reproducible.
rfc = RandomForestClassifier(n_estimators=100, random_state=101)
rfc.fit(X_train, y_train)

rfc_pred = rfc.predict(X_test)

print("**classification_report**",)
print(classification_report(y_test, rfc_pred))
print("**confusion_matrix**")
print(confusion_matrix(y_test, rfc_pred))
**classification_report**
              precision    recall  f1-score   support

      absent       0.86      0.95      0.90        20
     present       0.67      0.40      0.50         5

    accuracy                           0.84        25
   macro avg       0.77      0.68      0.70        25
weighted avg       0.82      0.84      0.82        25

**confusion_matrix**
[[19  1]
 [ 3  2]]
In [ ]:
######## Support Vector Machines (SVM) ########
In [4]:
# SVM on the built-in breast-cancer dataset.
# This cell re-imports everything it uses (including numpy, which the
# original relied on from a much earlier cell) so it is self-contained.
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report,confusion_matrix

# Bunch object exposing 'data', 'target', 'feature_names', ...
cancer = load_breast_cancer()

df_feat = pd.DataFrame(cancer['data'],columns=cancer['feature_names'])

df_target = pd.DataFrame(cancer['target'],columns=['Cancer'])

# np.ravel flattens the (n, 1) target frame into the 1-D label array
# scikit-learn expects; random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df_feat, np.ravel(df_target), test_size=0.30, random_state=101)

# Default SVC: RBF kernel with its default C and gamma (tuned later
# via GridSearchCV in the cells below).
model = SVC()
model.fit(X_train,y_train)

predictions = model.predict(X_test)

print("**classification_report**",)
print(classification_report(y_test,predictions))
print("**confusion_matrix**")
print(confusion_matrix(y_test,predictions))
**classification_report**
              precision    recall  f1-score   support

           0       0.95      0.85      0.90        66
           1       0.91      0.97      0.94       105

    accuracy                           0.92       171
   macro avg       0.93      0.91      0.92       171
weighted avg       0.93      0.92      0.92       171

**confusion_matrix**
[[ 56  10]
 [  3 102]]

Gridsearch¶

Finding the right parameters (like what C or gamma values to use) is a tricky task! But luckily, we can be a little lazy and just try a bunch of combinations and see what works best! This idea of creating a 'grid' of parameters and just trying out all the possible combinations is called a Gridsearch. This method is common enough that Scikit-learn has this functionality built in with GridSearchCV! The CV stands for cross-validation, which is the process of splitting the training data into several folds and evaluating each parameter combination on every fold.

GridSearchCV takes a dictionary that describes the parameters that should be tried and a model to train. The grid of parameters is defined as a dictionary, where the keys are the parameters and the values are the settings to be tested.

One of the great things about GridSearchCV is that it is a meta-estimator. It takes an estimator like SVC and creates a new estimator that behaves exactly the same - in this case, like a classifier. You should add refit=True and set verbose to whatever number you want; the higher the number, the more verbose the output (verbose just means the text output describing the process).

What fit does is a bit more involved than usual. First, it runs the same loop with cross-validation, to find the best parameter combination. Once it has the best combination, it runs fit again on all data passed to fit (without cross-validation), to build a single new model using the best parameter setting.

In [9]:
# Hyperparameter grid for the RBF-kernel SVM: GridSearchCV will try
# every (C, gamma) pair — 5 x 5 = 25 candidates, 5 CV folds each.
param_grid = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ['rbf'],
}

from sklearn.model_selection import GridSearchCV

# refit=True retrains the winning combination on the full training set;
# verbose=3 prints one progress line per cross-validation fit.
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=3)

# May take awhile!
grid.fit(X_train, y_train)
Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.637 total time=   0.0s
[CV 2/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.637 total time=   0.0s
[CV 3/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.625 total time=   0.0s
[CV 4/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.633 total time=   0.0s
[CV 5/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.633 total time=   0.0s
[CV 1/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.637 total time=   0.0s
[CV 2/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.637 total time=   0.0s
[CV 3/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.625 total time=   0.0s
[CV 4/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.633 total time=   0.0s
[CV 5/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.633 total time=   0.0s
[CV 1/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.637 total time=   0.0s
[CV 2/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.637 total time=   0.0s
[CV 3/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.625 total time=   0.0s
[CV 4/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.633 total time=   0.0s
[CV 5/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.633 total time=   0.0s
[CV 1/5] END ....C=0.1, gamma=0.001, kernel=rbf;, score=0.637 total time=   0.0s
[CV 2/5] END ....C=0.1, gamma=0.001, kernel=rbf;, score=0.637 total time=   0.0s
[CV 3/5] END ....C=0.1, gamma=0.001, kernel=rbf;, score=0.625 total time=   0.0s
[CV 4/5] END ....C=0.1, gamma=0.001, kernel=rbf;, score=0.633 total time=   0.0s
[CV 5/5] END ....C=0.1, gamma=0.001, kernel=rbf;, score=0.633 total time=   0.0s
[CV 1/5] END ...C=0.1, gamma=0.0001, kernel=rbf;, score=0.887 total time=   0.0s
[CV 2/5] END ...C=0.1, gamma=0.0001, kernel=rbf;, score=0.938 total time=   0.0s
[CV 3/5] END ...C=0.1, gamma=0.0001, kernel=rbf;, score=0.963 total time=   0.0s
[CV 4/5] END ...C=0.1, gamma=0.0001, kernel=rbf;, score=0.962 total time=   0.0s
[CV 5/5] END ...C=0.1, gamma=0.0001, kernel=rbf;, score=0.886 total time=   0.0s
[CV 1/5] END ..........C=1, gamma=1, kernel=rbf;, score=0.637 total time=   0.0s
[CV 2/5] END ..........C=1, gamma=1, kernel=rbf;, score=0.637 total time=   0.0s
[CV 3/5] END ..........C=1, gamma=1, kernel=rbf;, score=0.625 total time=   0.0s
[CV 4/5] END ..........C=1, gamma=1, kernel=rbf;, score=0.633 total time=   0.0s
[CV 5/5] END ..........C=1, gamma=1, kernel=rbf;, score=0.633 total time=   0.0s
[CV 1/5] END ........C=1, gamma=0.1, kernel=rbf;, score=0.637 total time=   0.0s
[CV 2/5] END ........C=1, gamma=0.1, kernel=rbf;, score=0.637 total time=   0.0s
[CV 3/5] END ........C=1, gamma=0.1, kernel=rbf;, score=0.625 total time=   0.0s
[CV 4/5] END ........C=1, gamma=0.1, kernel=rbf;, score=0.633 total time=   0.0s
[CV 5/5] END ........C=1, gamma=0.1, kernel=rbf;, score=0.633 total time=   0.0s
[CV 1/5] END .......C=1, gamma=0.01, kernel=rbf;, score=0.637 total time=   0.0s
[CV 2/5] END .......C=1, gamma=0.01, kernel=rbf;, score=0.637 total time=   0.0s
[CV 3/5] END .......C=1, gamma=0.01, kernel=rbf;, score=0.625 total time=   0.0s
[CV 4/5] END .......C=1, gamma=0.01, kernel=rbf;, score=0.633 total time=   0.0s
[CV 5/5] END .......C=1, gamma=0.01, kernel=rbf;, score=0.633 total time=   0.0s
[CV 1/5] END ......C=1, gamma=0.001, kernel=rbf;, score=0.900 total time=   0.0s
[CV 2/5] END ......C=1, gamma=0.001, kernel=rbf;, score=0.912 total time=   0.0s
[CV 3/5] END ......C=1, gamma=0.001, kernel=rbf;, score=0.925 total time=   0.0s
[CV 4/5] END ......C=1, gamma=0.001, kernel=rbf;, score=0.962 total time=   0.0s
[CV 5/5] END ......C=1, gamma=0.001, kernel=rbf;, score=0.937 total time=   0.0s
[CV 1/5] END .....C=1, gamma=0.0001, kernel=rbf;, score=0.912 total time=   0.0s
[CV 2/5] END .....C=1, gamma=0.0001, kernel=rbf;, score=0.950 total time=   0.0s
[CV 3/5] END .....C=1, gamma=0.0001, kernel=rbf;, score=0.975 total time=   0.0s
[CV 4/5] END .....C=1, gamma=0.0001, kernel=rbf;, score=0.962 total time=   0.0s
[CV 5/5] END .....C=1, gamma=0.0001, kernel=rbf;, score=0.937 total time=   0.0s
[CV 1/5] END .........C=10, gamma=1, kernel=rbf;, score=0.637 total time=   0.0s
[CV 2/5] END .........C=10, gamma=1, kernel=rbf;, score=0.637 total time=   0.0s
[CV 3/5] END .........C=10, gamma=1, kernel=rbf;, score=0.625 total time=   0.0s
[CV 4/5] END .........C=10, gamma=1, kernel=rbf;, score=0.633 total time=   0.0s
[CV 5/5] END .........C=10, gamma=1, kernel=rbf;, score=0.633 total time=   0.0s
[CV 1/5] END .......C=10, gamma=0.1, kernel=rbf;, score=0.637 total time=   0.0s
[CV 2/5] END .......C=10, gamma=0.1, kernel=rbf;, score=0.637 total time=   0.0s
[CV 3/5] END .......C=10, gamma=0.1, kernel=rbf;, score=0.625 total time=   0.0s
[CV 4/5] END .......C=10, gamma=0.1, kernel=rbf;, score=0.633 total time=   0.0s
[CV 5/5] END .......C=10, gamma=0.1, kernel=rbf;, score=0.633 total time=   0.0s
[CV 1/5] END ......C=10, gamma=0.01, kernel=rbf;, score=0.637 total time=   0.0s
[CV 2/5] END ......C=10, gamma=0.01, kernel=rbf;, score=0.637 total time=   0.0s
[CV 3/5] END ......C=10, gamma=0.01, kernel=rbf;, score=0.613 total time=   0.0s
[CV 4/5] END ......C=10, gamma=0.01, kernel=rbf;, score=0.633 total time=   0.0s
[CV 5/5] END ......C=10, gamma=0.01, kernel=rbf;, score=0.633 total time=   0.0s
[CV 1/5] END .....C=10, gamma=0.001, kernel=rbf;, score=0.887 total time=   0.0s
[CV 2/5] END .....C=10, gamma=0.001, kernel=rbf;, score=0.912 total time=   0.0s
[CV 3/5] END .....C=10, gamma=0.001, kernel=rbf;, score=0.900 total time=   0.0s
[CV 4/5] END .....C=10, gamma=0.001, kernel=rbf;, score=0.937 total time=   0.0s
[CV 5/5] END .....C=10, gamma=0.001, kernel=rbf;, score=0.924 total time=   0.0s
[CV 1/5] END ....C=10, gamma=0.0001, kernel=rbf;, score=0.950 total time=   0.0s
[CV 2/5] END ....C=10, gamma=0.0001, kernel=rbf;, score=0.912 total time=   0.0s
[CV 3/5] END ....C=10, gamma=0.0001, kernel=rbf;, score=0.975 total time=   0.0s
[CV 4/5] END ....C=10, gamma=0.0001, kernel=rbf;, score=0.949 total time=   0.0s
[CV 5/5] END ....C=10, gamma=0.0001, kernel=rbf;, score=0.949 total time=   0.0s
[CV 1/5] END ........C=100, gamma=1, kernel=rbf;, score=0.637 total time=   0.0s
[CV 2/5] END ........C=100, gamma=1, kernel=rbf;, score=0.637 total time=   0.0s
[CV 3/5] END ........C=100, gamma=1, kernel=rbf;, score=0.625 total time=   0.0s
[CV 4/5] END ........C=100, gamma=1, kernel=rbf;, score=0.633 total time=   0.0s
[CV 5/5] END ........C=100, gamma=1, kernel=rbf;, score=0.633 total time=   0.0s
[CV 1/5] END ......C=100, gamma=0.1, kernel=rbf;, score=0.637 total time=   0.0s
[CV 2/5] END ......C=100, gamma=0.1, kernel=rbf;, score=0.637 total time=   0.0s
[CV 3/5] END ......C=100, gamma=0.1, kernel=rbf;, score=0.625 total time=   0.0s
[CV 4/5] END ......C=100, gamma=0.1, kernel=rbf;, score=0.633 total time=   0.0s
[CV 5/5] END ......C=100, gamma=0.1, kernel=rbf;, score=0.633 total time=   0.0s
[CV 1/5] END .....C=100, gamma=0.01, kernel=rbf;, score=0.637 total time=   0.0s
[CV 2/5] END .....C=100, gamma=0.01, kernel=rbf;, score=0.637 total time=   0.0s
[CV 3/5] END .....C=100, gamma=0.01, kernel=rbf;, score=0.613 total time=   0.0s
[CV 4/5] END .....C=100, gamma=0.01, kernel=rbf;, score=0.633 total time=   0.0s
[CV 5/5] END .....C=100, gamma=0.01, kernel=rbf;, score=0.633 total time=   0.0s
[CV 1/5] END ....C=100, gamma=0.001, kernel=rbf;, score=0.887 total time=   0.0s
[CV 2/5] END ....C=100, gamma=0.001, kernel=rbf;, score=0.912 total time=   0.0s
[CV 3/5] END ....C=100, gamma=0.001, kernel=rbf;, score=0.900 total time=   0.0s
[CV 4/5] END ....C=100, gamma=0.001, kernel=rbf;, score=0.937 total time=   0.0s
[CV 5/5] END ....C=100, gamma=0.001, kernel=rbf;, score=0.924 total time=   0.0s
[CV 1/5] END ...C=100, gamma=0.0001, kernel=rbf;, score=0.925 total time=   0.0s
[CV 2/5] END ...C=100, gamma=0.0001, kernel=rbf;, score=0.912 total time=   0.0s
[CV 3/5] END ...C=100, gamma=0.0001, kernel=rbf;, score=0.975 total time=   0.0s
[CV 4/5] END ...C=100, gamma=0.0001, kernel=rbf;, score=0.937 total time=   0.0s
[CV 5/5] END ...C=100, gamma=0.0001, kernel=rbf;, score=0.949 total time=   0.0s
[CV 1/5] END .......C=1000, gamma=1, kernel=rbf;, score=0.637 total time=   0.0s
[CV 2/5] END .......C=1000, gamma=1, kernel=rbf;, score=0.637 total time=   0.0s
[CV 3/5] END .......C=1000, gamma=1, kernel=rbf;, score=0.625 total time=   0.0s
[CV 4/5] END .......C=1000, gamma=1, kernel=rbf;, score=0.633 total time=   0.0s
[CV 5/5] END .......C=1000, gamma=1, kernel=rbf;, score=0.633 total time=   0.0s
[CV 1/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=0.637 total time=   0.0s
[CV 2/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=0.637 total time=   0.0s
[CV 3/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=0.625 total time=   0.0s
[CV 4/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=0.633 total time=   0.0s
[CV 5/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=0.633 total time=   0.0s
[CV 1/5] END ....C=1000, gamma=0.01, kernel=rbf;, score=0.637 total time=   0.0s
[CV 2/5] END ....C=1000, gamma=0.01, kernel=rbf;, score=0.637 total time=   0.0s
[CV 3/5] END ....C=1000, gamma=0.01, kernel=rbf;, score=0.613 total time=   0.0s
[CV 4/5] END ....C=1000, gamma=0.01, kernel=rbf;, score=0.633 total time=   0.0s
[CV 5/5] END ....C=1000, gamma=0.01, kernel=rbf;, score=0.633 total time=   0.0s
[CV 1/5] END ...C=1000, gamma=0.001, kernel=rbf;, score=0.887 total time=   0.0s
[CV 2/5] END ...C=1000, gamma=0.001, kernel=rbf;, score=0.912 total time=   0.0s
[CV 3/5] END ...C=1000, gamma=0.001, kernel=rbf;, score=0.900 total time=   0.0s
[CV 4/5] END ...C=1000, gamma=0.001, kernel=rbf;, score=0.937 total time=   0.0s
[CV 5/5] END ...C=1000, gamma=0.001, kernel=rbf;, score=0.924 total time=   0.0s
[CV 1/5] END ..C=1000, gamma=0.0001, kernel=rbf;, score=0.938 total time=   0.0s
[CV 2/5] END ..C=1000, gamma=0.0001, kernel=rbf;, score=0.912 total time=   0.0s
[CV 3/5] END ..C=1000, gamma=0.0001, kernel=rbf;, score=0.963 total time=   0.0s
[CV 4/5] END ..C=1000, gamma=0.0001, kernel=rbf;, score=0.924 total time=   0.0s
[CV 5/5] END ..C=1000, gamma=0.0001, kernel=rbf;, score=0.962 total time=   0.0s
Out[9]:
GridSearchCV(estimator=SVC(),
             param_grid={'C': [0.1, 1, 10, 100, 1000],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                         'kernel': ['rbf']},
             verbose=3)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GridSearchCV(estimator=SVC(),
             param_grid={'C': [0.1, 1, 10, 100, 1000],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                         'kernel': ['rbf']},
             verbose=3)
SVC()
SVC()
In [11]:
# Inspect the GridSearchCV results:
#   best_estimator_ - the model refit on all training data with the
#                     winning hyperparameters
#   best_params_    - the winning hyperparameter combination
#   best_score_     - mean cross-validated score of best_params_
# (print('\n') emits two blank lines: the '\n' string plus print's
# own trailing newline.)

print(grid.best_estimator_)
print('\n')
print(grid.best_params_)
print('\n')
print(grid.best_score_)
SVC(C=1, gamma=0.0001)


{'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'}


0.9472468354430379
In [12]:
# Evaluate the tuned model on the held-out test set: per-class
# precision/recall/F1 plus the raw confusion matrix.
grid_predictions = grid.predict(X_test)

from sklearn.metrics import classification_report, confusion_matrix

report_text = classification_report(y_test, grid_predictions)
cm = confusion_matrix(y_test, grid_predictions)

print("**classification_report**")
print(report_text)
print("**confusion_matrix**")
print(cm)
**classification_report**
              precision    recall  f1-score   support

           0       0.94      0.89      0.91        66
           1       0.94      0.96      0.95       105

    accuracy                           0.94       171
   macro avg       0.94      0.93      0.93       171
weighted avg       0.94      0.94      0.94       171

**confusion_matrix**
[[ 59   7]
 [  4 101]]

K Means Clustering¶

Method Used¶

K Means Clustering is an unsupervised learning algorithm that tries to cluster data based on their similarity. Unsupervised learning means that there is no outcome to be predicted, and the algorithm just tries to find patterns in the data. In k means clustering, we have to specify the number of clusters we want the data to be grouped into. The algorithm randomly assigns each observation to a cluster, and finds the centroid of each cluster. Then, the algorithm iterates through two steps: Reassign data points to the cluster whose centroid is closest. Calculate the new centroid of each cluster. These two steps are repeated until the within-cluster variation cannot be reduced any further. The within-cluster variation is calculated as the sum of the euclidean distances between the data points and their respective cluster centroids.

Create some Data¶

In [3]:
from sklearn.datasets import make_blobs

# Generate a toy dataset: make_blobs returns a tuple where
#   data[0] — feature matrix of shape (200, 2),
#   data[1] — integer blob label (0..3) each sample was drawn from.
data = make_blobs(n_samples=200,
                  n_features=2,
                  centers=4,
                  cluster_std=1.8,
                  random_state=101)

points, blob_labels = data[0], data[1]

# Scatter the two features, coloured by the generating blob.
plt.scatter(x=points[:, 0],
            y=points[:, 1],
            c=blob_labels,
            cmap='rainbow')
plt.xlabel('data[0] 1st column')
plt.ylabel('data[0] 2nd column')
plt.title('200 datas x 2 features of 4 centers generated')
Out[3]:
Text(0.5, 1.0, '200 datas x 2 features of 4 centers generated')

Creating the Clusters¶

In [4]:
from sklearn.cluster import KMeans

# Fit K-Means on the feature matrix only — the true labels in data[1]
# are deliberately withheld (unsupervised setting).
kmeans = KMeans(n_clusters=4, n_init=10)
kmeans.fit(data[0])

fig, (ax1, ax2) = plt.subplots(1, 2, sharey=True, figsize=(10, 6))

xs, ys = data[0][:, 0], data[0][:, 1]

# Left panel: points coloured by the cluster K-Means assigned.
ax1.set_title('K Means')
ax1.scatter(xs, ys, c=kmeans.labels_, cmap='rainbow')

# Right panel: points coloured by the blob they were generated from,
# for visual comparison (cluster ids need not match label ids).
ax2.set_title("Original")
ax2.scatter(xs, ys, c=data[1], cmap='rainbow')
Out[4]:
<matplotlib.collections.PathCollection at 0x226cf3674f0>
In [6]:
kmeans.cluster_centers_ #centers found
Out[6]:
array([[-4.13591321,  7.95389851],
       [-9.46941837, -6.56081545],
       [-0.0123077 ,  2.13407664],
       [ 3.71749226,  7.01388735]])

PCA Visualization¶

As we've noticed before, it is difficult to visualize high dimensional data; we can use PCA to find the first two principal components, and visualize the data in this new, two-dimensional space, with a single scatter-plot. Before we do this though, we'll need to scale our data so that each feature has unit variance.

In [3]:
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

cancer = load_breast_cancer()

df = pd.DataFrame(cancer['data'], columns=cancer['feature_names'])

# Standardise every feature (zero mean, unit variance) so that features
# with large numeric ranges do not dominate the principal components.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df)
#scaled_data.shape #(569, 30) — 30 standardised features

# Project the 30 features onto the 2 directions of greatest variance.
pca = PCA(n_components=2)
pca.fit(scaled_data)

x_pca = pca.transform(scaled_data)
#x_pca.shape #(569, 2) — 2 principal components

# Colour each sample by its diagnosis target to see how well the two
# components separate the classes.
plt.figure(figsize=(8, 6))
plt.scatter(x_pca[:, 0], x_pca[:, 1], c=cancer['target'], cmap='plasma')
plt.xlabel('First principal component')
plt.ylabel('Second Principal Component')
Out[3]:
Text(0, 0.5, 'Second Principal Component')

Clearly by using these two components we can easily separate these two classes.

Interpreting the components¶

Unfortunately, with this great power of dimensionality reduction, comes the cost of being able to easily understand what these components represent.

The components correspond to combinations of the original features, the components themselves are stored as an attribute of the fitted PCA object:

In [4]:
# Each row of pca.components_ is one principal direction expressed as
# weights (loadings) over the original 30 features.
loadings = pca.components_
df_comp = pd.DataFrame(loadings, columns=cancer['feature_names'])
df_comp
Out[4]:
mean radius mean texture mean perimeter mean area mean smoothness mean compactness mean concavity mean concave points mean symmetry mean fractal dimension ... worst radius worst texture worst perimeter worst area worst smoothness worst compactness worst concavity worst concave points worst symmetry worst fractal dimension
0 0.218902 0.103725 0.227537 0.220995 0.142590 0.239285 0.258400 0.260854 0.138167 0.064363 ... 0.227997 0.104469 0.236640 0.224871 0.127953 0.210096 0.228768 0.250886 0.122905 0.131784
1 -0.233857 -0.059706 -0.215181 -0.231077 0.186113 0.151892 0.060165 -0.034768 0.190349 0.366575 ... -0.219866 -0.045467 -0.199878 -0.219352 0.172304 0.143593 0.097964 -0.008257 0.141883 0.275339

2 rows × 30 columns

In [5]:
# Heatmap of the component loadings: hot/cold cells show which original
# features contribute most to each principal component.
plt.figure(figsize=(12, 6))
sns.heatmap(df_comp, cmap='plasma')
Out[5]:
<AxesSubplot: >
In [ ]:
#####################################
#####################################
# Check predictions results
#####################################
#####################################

Classification Error Metrics¶

  • True positives (TP)
  • True negatives (TN)
  • False positives (FP) *[Type I error]* (A man is pregnant)
  • False negatives (FN) *[Type II error]* (A pregnant woman isn't pregnant)

image.png

(Image in MarkDown Cell)¶

In [7]:
# Display an image file from a code cell (alternative to embedding
# it in a markdown cell).
Image(filename='imgs/confusionMatrix.JPG')
Out[7]:

Accuracy¶

  • Overall, how often is it correct ?
  • (correct predictions) / (total predictions)
  • (TP + TN) / total

= 150 /165 = 0.91

Accuracy is useful when target classes are well balanced, BUT not a good choice with **unbalanced** classes!

Misclassification Rate (Error Rate)¶

  • Overall, how often is it wrong ?
  • (wrong predictions) / (total predictions)
  • (FP + FN) / total

= 15 /165 = 0.09

Recall¶

  • Ability of a model to find all the relevant cases within a dataset.

TP / (TP + FN)
= 100 / 105 = 0.95

Precision¶

  • Proportion of the data points our model flagged as relevant that actually were relevant

TP / (TP + FP)
= 100 / 110 = 0.91

Recall & Precision¶

While recall expresses the ability to find all relevant instances in a dataset,
precision expresses the proportion of the data points the model flagged as relevant that actually were relevant.

F1-Score¶

  • "harmonic mean" of Precision and Recall ==> punishes extreme values
$${F}_1 = 2 * \frac {precision * recall} {precision + recall}$$

image.png

In [9]:
# Cheat-sheet of the metric formulas derived from the confusion matrix.
Image('imgs/confusionMatrixFormulas.png')
Out[9]:

Regression Evaluation Metrics¶

Here are three common evaluation metrics for regression problems:

Mean Absolute Error (MAE) is the mean of the absolute value of the errors:

$$\frac 1n\sum_{i=1}^n|y_i-\hat{y}_i|$$

(somme des différences (en valeur absolue) entre valeur prédite et valeur réelle / (nombre de prédictions)
==> large errors not punished)

Mean Squared Error (MSE) is the mean of the squared errors:

$$\frac 1n\sum_{i=1}^n(y_i-\hat{y}_i)^2$$

(somme des {différences (en valeur absolue) entre valeur prédite et valeur réelle}^2 / (nombre de prédictions)
==> unité -> unité^2)

Root Mean Squared Error (RMSE) is the square root of the mean of the squared errors:

$$\sqrt{\frac 1n\sum_{i=1}^n(y_i-\hat{y}_i)^2}$$

RacineCarré[somme des {différences* entre valeur prédite et valeur réelle}^2 / (nombre de prédictions)]

Comparing these metrics:

  • MAE is the easiest to understand, because it's the average error.
  • MSE is more popular than MAE, because MSE "punishes" larger errors, which tends to be useful in the real world.
  • RMSE is even more popular than MSE, because RMSE is interpretable in the "y" units.

All of these are loss functions, because we want to minimize them.

In [100]:
from sklearn.metrics import classification_report, confusion_matrix

# Summarise the predictions against the held-out labels:
# per-class precision/recall/F1 plus the raw confusion matrix.
report_str = classification_report(y_test, predictions)
cm = confusion_matrix(y_test, predictions)

print("**classification_report**")
print(report_str)
print("**confusion_matrix**")
print(cm)
**classification_report**
              precision    recall  f1-score   support

           0       0.83      0.90      0.86       163
           1       0.82      0.71      0.76       104

    accuracy                           0.83       267
   macro avg       0.83      0.81      0.81       267
weighted avg       0.83      0.83      0.83       267

**confusion_matrix**
[[147  16]
 [ 30  74]]

Recommender Systems¶

2 types¶

  • Content-Based : Focus on the attributes of the item and give recommendations based on the similarity between them

  • Collaborative Filtering : Produces recommendations based on the knowledge of users' attitudes towards items

In [38]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('white')
%matplotlib inline

column_names = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv('datas/u.data', sep='\t', names=column_names)

movie_titles = pd.read_csv('datas/Movie_Id_Titles')

df = pd.merge(df,movie_titles,on='item_id')
df.head()
Out[38]:
user_id item_id rating timestamp title
0 0 50 5 881250949 Star Wars (1977)
1 290 50 5 880473582 Star Wars (1977)
2 79 50 4 891271545 Star Wars (1977)
3 2 50 5 888552084 Star Wars (1977)
4 8 50 5 879362124 Star Wars (1977)
In [42]:
# Per-title average rating, as a one-column DataFrame indexed by title.
ratings = pd.DataFrame(df.groupby('title')['rating'].mean())

# Number of ratings per title. Assign the count Series directly: pandas
# aligns it on the shared title index, so wrapping it in pd.DataFrame
# first (as before) was redundant.
ratings['num of ratings'] = df.groupby('title')['rating'].count()

# Most-rated titles first.
ratings.sort_values(by='num of ratings', ascending=False).head(10)
Out[42]:
rating num of ratings
title
Star Wars (1977) 4.359589 584
Contact (1997) 3.803536 509
Fargo (1996) 4.155512 508
Return of the Jedi (1983) 4.007890 507
Liar Liar (1997) 3.156701 485
English Patient, The (1996) 3.656965 481
Scream (1996) 3.441423 478
Toy Story (1995) 3.878319 452
Air Force One (1997) 3.631090 431
Independence Day (ID4) (1996) 3.438228 429
In [40]:
# for each user_id his rate for each movie
moviemat = df.pivot_table(index='user_id',
                          columns='title',values='rating')
moviemat.head()
Out[40]:
title 'Til There Was You (1997) 1-900 (1994) 101 Dalmatians (1996) 12 Angry Men (1957) 187 (1997) 2 Days in the Valley (1996) 20,000 Leagues Under the Sea (1954) 2001: A Space Odyssey (1968) 3 Ninjas: High Noon At Mega Mountain (1998) 39 Steps, The (1935) ... Yankee Zulu (1994) Year of the Horse (1997) You So Crazy (1994) Young Frankenstein (1974) Young Guns (1988) Young Guns II (1990) Young Poisoner's Handbook, The (1995) Zeus and Roxanne (1997) unknown Á köldum klaka (Cold Fever) (1994)
user_id
0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 NaN NaN 2.0 5.0 NaN NaN 3.0 4.0 NaN NaN ... NaN NaN NaN 5.0 3.0 NaN NaN NaN 4.0 NaN
2 NaN NaN NaN NaN NaN NaN NaN NaN 1.0 NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 NaN NaN NaN NaN 2.0 NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

5 rows × 1664 columns

In [43]:
# Per-user rating vectors for two reference movies (Series indexed by
# user_id; NaN for users who did not rate them).
starwars_user_ratings = moviemat['Star Wars (1977)']
liarliar_user_ratings = moviemat['Liar Liar (1997)']

starwars_user_ratings.head(20)
Out[43]:
user_id
0     5.0
1     5.0
2     5.0
3     NaN
4     5.0
5     4.0
6     4.0
7     5.0
8     5.0
9     5.0
10    5.0
11    NaN
12    4.0
13    5.0
14    5.0
15    5.0
16    NaN
17    NaN
18    4.0
19    NaN
Name: Star Wars (1977), dtype: float64
In [44]:
##################################################
# Recommender system based on rating correlation #
##################################################
# For every movie column, compute the pairwise Pearson correlation of its
# user-rating vector with the reference movie's vector (only users who
# rated both contribute).
# NOTE: the RuntimeWarnings emitted here are expected — movies sharing too
# few raters with the reference yield an undefined correlation (NaN).
similar_to_starwars = moviemat.corrwith(starwars_user_ratings)
similar_to_liarliar = moviemat.corrwith(liarliar_user_ratings)

similar_to_starwars
C:\Python310\lib\site-packages\numpy\lib\function_base.py:2845: RuntimeWarning:

Degrees of freedom <= 0 for slice

C:\Python310\lib\site-packages\numpy\lib\function_base.py:2704: RuntimeWarning:

divide by zero encountered in divide

C:\Python310\lib\site-packages\numpy\lib\function_base.py:2845: RuntimeWarning:

Degrees of freedom <= 0 for slice

C:\Python310\lib\site-packages\numpy\lib\function_base.py:2704: RuntimeWarning:

divide by zero encountered in divide

Out[44]:
title
'Til There Was You (1997)                0.872872
1-900 (1994)                            -0.645497
101 Dalmatians (1996)                    0.211132
12 Angry Men (1957)                      0.184289
187 (1997)                               0.027398
                                           ...   
Young Guns II (1990)                     0.228615
Young Poisoner's Handbook, The (1995)   -0.007374
Zeus and Roxanne (1997)                  0.818182
unknown                                  0.723123
Á köldum klaka (Cold Fever) (1994)            NaN
Length: 1664, dtype: float64
In [45]:
# Wrap the correlation Series in a DataFrame and drop movies whose
# correlation is undefined (too little rating overlap with Star Wars).
# Reassignment instead of dropna(inplace=True): inplace has no performance
# benefit and makes the cell non-idempotent on re-runs.
corr_starwars = pd.DataFrame(similar_to_starwars, columns=['Correlation'])
corr_starwars = corr_starwars.dropna()
corr_starwars.head()
Out[45]:
Correlation
title
'Til There Was You (1997) 0.872872
1-900 (1994) -0.645497
101 Dalmatians (1996) 0.211132
12 Angry Men (1957) 0.184289
187 (1997) 0.027398
In [ ]:
 # Bring in the per-title rating counts; join() aligns on the shared
 # title index of both frames.
corr_starwars = corr_starwars.join(ratings['num of ratings'])
In [54]:
# Ranking by raw correlation surfaces noise: many perfect +/-1 scores come
# from movies rated by only a handful of users who also happened to watch
# Star Wars (the most-rated movie in the set).
corr_starwars.sort_values('Correlation', ascending=False)
Out[54]:
Correlation num of ratings
title
Hollow Reed (1996) 1.0 6
Commandments (1997) 1.0 3
Cosi (1996) 1.0 4
No Escape (1994) 1.0 5
Stripes (1981) 1.0 5
... ... ...
For Ever Mozart (1996) -1.0 3
Frankie Starlight (1995) -1.0 4
I Like It Like That (1994) -1.0 3
American Dream (1990) -1.0 2
Theodore Rex (1995) -1.0 5

1410 rows × 2 columns

In [62]:
# Distribution of how many ratings each movie received (long right tail:
# a few blockbusters, many rarely-rated titles).
plt.figure(figsize=(10, 4))
ratings['num of ratings'].hist(bins=70)
plt.xlabel('Num of ratings')
Out[62]:
Text(0.5, 0, 'Num of ratings')
In [63]:
# Require a meaningful sample size (>100 ratings) before ranking by
# correlation — this filters out the spurious +/-1 correlations.
well_rated = corr_starwars[corr_starwars['num of ratings'] > 100]
well_rated.sort_values('Correlation', ascending=False).head()
Out[63]:
Correlation num of ratings
title
Star Wars (1977) 1.000000 584
Empire Strikes Back, The (1980) 0.748353 368
Return of the Jedi (1983) 0.672556 507
Raiders of the Lost Ark (1981) 0.536117 420
Austin Powers: International Man of Mystery (1997) 0.377433 130
In [ ]: